First, let’s look at the TE breakpoint analysis
Dvir_breakpoints <- read.csv('/Users/jullienflynn/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/DAIBAM/Dvir_breakpoint_TEs_forR.csv', header=T)
Dvir_breakpoints
## Ref_species Inversion Type base_pairs
## 1 Dvir In2b_d DAIBAM 0
## 2 Dvir In2b_d DNA 0
## 3 Dvir In2b_d LTR 0
## 4 Dvir In2b_d LINE 0
## 5 Dvir In2b_d Unknown 0
## 6 Dvir In2b_d Other 0
## 7 Dvir In2b_d RC 0
## 8 Dvir In2b_p DAIBAM 0
## 9 Dvir In2b_p DNA 0
## 10 Dvir In2b_p LTR 649
## 11 Dvir In2b_p LINE 0
## 12 Dvir In2b_p Unknown 323
## 13 Dvir In2b_p Other 0
## 14 Dvir In2b_p RC 3817
## 15 Dvir In2c_d DAIBAM 0
## 16 Dvir In2c_d DNA 0
## 17 Dvir In2c_d LTR 136
## 18 Dvir In2c_d LINE 129
## 19 Dvir In2c_d Unknown 111
## 20 Dvir In2c_d Other 0
## 21 Dvir In2c_d RC 112
## 22 Dvir In2c_p DAIBAM 0
## 23 Dvir In2c_p DNA 0
## 24 Dvir In2c_p LTR 0
## 25 Dvir In2c_p LINE 0
## 26 Dvir In2c_p Unknown 88
## 27 Dvir In2c_p Other 0
## 28 Dvir In2c_p RC 0
## 29 Dvir In2a_d DAIBAM 738
## 30 Dvir In2a_d DNA 0
## 31 Dvir In2a_d LTR 0
## 32 Dvir In2a_d LINE 0
## 33 Dvir In2a_d Unknown 506
## 34 Dvir In2a_d Other 0
## 35 Dvir In2a_d RC 0
## 36 Dvir In2a_p DAIBAM 1249
## 37 Dvir In2a_p DNA 455
## 38 Dvir In2a_p LTR 297
## 39 Dvir In2a_p LINE 52
## 40 Dvir In2a_p Unknown 691
## 41 Dvir In2a_p Other 0
## 42 Dvir In2a_p RC 1833
## 43 Dvir In4a_d DAIBAM 0
## 44 Dvir In4a_d DNA 0
## 45 Dvir In4a_d LTR 0
## 46 Dvir In4a_d LINE 0
## 47 Dvir In4a_d Unknown 0
## 48 Dvir In4a_d Other 0
## 49 Dvir In4a_d RC 0
## 50 Dvir In4a_p DAIBAM 0
## 51 Dvir In4a_p DNA 0
## 52 Dvir In4a_p LTR 0
## 53 Dvir In4a_p LINE 0
## 54 Dvir In4a_p Unknown 0
## 55 Dvir In4a_p Other 0
## 56 Dvir In4a_p RC 0
## 57 Dvir In5b_d DAIBAM 0
## 58 Dvir In5b_d DNA 0
## 59 Dvir In5b_d LTR 68
## 60 Dvir In5b_d LINE 0
## 61 Dvir In5b_d Unknown 77
## 62 Dvir In5b_d Other 0
## 63 Dvir In5b_d RC 0
## 64 Dvir In5b_p DAIBAM 0
## 65 Dvir In5b_p DNA 0
## 66 Dvir In5b_p LTR 0
## 67 Dvir In5b_p LINE 0
## 68 Dvir In5b_p Unknown 0
## 69 Dvir In5b_p Other 0
## 70 Dvir In5b_p RC 139
## 71 Dvir InXc_d DAIBAM 0
## 72 Dvir InXc_d DNA 0
## 73 Dvir InXc_d LTR 0
## 74 Dvir InXc_d LINE 0
## 75 Dvir InXc_d Unknown 0
## 76 Dvir InXc_d Other 0
## 77 Dvir InXc_d RC 0
## 78 Dvir InXc_p DAIBAM 0
## 79 Dvir InXc_p DNA 0
## 80 Dvir InXc_p LTR 786
## 81 Dvir InXc_p LINE 0
## 82 Dvir InXc_p Unknown 322
## 83 Dvir InXc_p Other 0
## 84 Dvir InXc_p RC 423
## 85 Dvir InXb_d DAIBAM 0
## 86 Dvir InXb_d DNA 0
## 87 Dvir InXb_d LTR 0
## 88 Dvir InXb_d LINE 0
## 89 Dvir InXb_d Unknown 0
## 90 Dvir InXb_d Other 0
## 91 Dvir InXb_d RC 0
## 92 Dvir InXb_p DAIBAM 0
## 93 Dvir InXb_p DNA 1006
## 94 Dvir InXb_p LTR 2149
## 95 Dvir InXb_p LINE 2262
## 96 Dvir InXb_p Unknown 644
## 97 Dvir InXb_p Other 0
## 98 Dvir InXb_p RC 8601
## 99 Dvir InXa_d DAIBAM 555
## 100 Dvir InXa_d DNA 0
## 101 Dvir InXa_d LTR 0
## 102 Dvir InXa_d LINE 0
## 103 Dvir InXa_d Unknown 0
## 104 Dvir InXa_d Other 0
## 105 Dvir InXa_d RC 0
## 106 Dvir InXa_p DAIBAM 0
## 107 Dvir InXa_p DNA 0
## 108 Dvir InXa_p LTR 0
## 109 Dvir InXa_p LINE 0
## 110 Dvir InXa_p Unknown 0
## 111 Dvir InXa_p Other 0
## 112 Dvir InXa_p RC 0
library(ggplot2)
## Warning in as.POSIXlt.POSIXct(Sys.time()): unknown timezone 'zone/tz/2021a.
## 2.0/zoneinfo/America/New_York'
ggplot() +
geom_bar(data = Dvir_breakpoints, aes(y = base_pairs, x = Inversion, fill = factor(Type, levels = c("DAIBAM", "DNA", "LTR", "LINE", "Unknown", "Other", "RC"))), stat = "identity", position = 'fill') +
scale_fill_manual(values = c("#FFCC00", "#33CC66", "#9966FF", "#FF33CC", "#00CCFF", "#CCCCCC", "#FF6600")) +
scale_y_continuous(labels = scales::percent) +
theme_bw() +
theme(legend.title=element_blank()) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylab("Percent of region") +
theme(text = element_text(size=15))
## Warning: Removed 42 rows containing missing values (geom_bar).
Dnov_breakpoints <- read.csv('/Users/jullienflynn/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/DAIBAM/Dnov_breakpoints_TEs_forR.csv', header=T)
ggplot() +
geom_bar(data = Dnov_breakpoints, aes(y = base_pairs, x = Inversion, fill = factor(Type, levels = c("DAIBAM", "DNA", "LTR", "LINE", "Unknown", "Other", "RC"))), stat = "identity", position = 'fill') +
scale_fill_manual(values = c("#FFCC00", "#33CC66", "#9966FF", "#FF33CC", "#00CCFF", "#CCCCCC", "#FF6600")) +
scale_y_continuous(labels = scales::percent) +
theme_bw() +
theme(legend.title=element_blank()) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylab("Percent of region") +
theme(text = element_text(size=15))
## Warning: Removed 7 rows containing missing values (geom_bar).
Dame_breakpoints <- read.csv('/Users/jullienflynn/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/DAIBAM/Dame_breakpoints_TEs_forR.csv', header=T)
ggplot() +
geom_bar(data = Dame_breakpoints, aes(y = base_pairs, x = Inversion, fill = factor(Type, levels = c("DAIBAM", "DNA", "LTR", "LINE", "Unknown", "Other", "RC"))), stat = "identity", position = 'fill') +
scale_fill_manual(values = c("#FFCC00", "#33CC66", "#9966FF", "#FF33CC", "#00CCFF", "#CCCCCC", "#FF6600")) +
scale_y_continuous(labels = scales::percent) +
theme_bw() +
theme(legend.title=element_blank()) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylab("Percent of region") +
theme(text = element_text(size=15))
## Warning: Removed 42 rows containing missing values (geom_bar).
Genome-wide TE annotation: pie charts
library("ggplot2")
Dvir_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/TE_landscapes/Dvir_masking.csv', header=T)
Dvir_masking
## Type nt_masked total_nt perc_mask
## 1 LINE 10245012 189680791 5.401186
## 2 DNA 2823009 189680791 1.488295
## 3 LTR 21901494 189680791 11.546501
## 4 Helitron/DINEs 12751857 189680791 6.722798
## 5 Other 3053549 189680791 1.609836
## 6 Unknown 5100259 189680791 2.688864
## 7 Non-TE 133805611 189680791 70.542521
Dvir_masking$Type <- factor(Dvir_masking$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (Dvir_masking, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
# virilis old genome.
Dvir_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/TE_landscapes/virilis_caf1.csv', header=T)
Dvir_masking
## Type masked_nt total_nt perc_mask
## 1 LTR 17456713 189206400 9.2262804
## 2 LINE 9588644 189206400 5.0678222
## 3 DNA 1338553 189206400 0.7074565
## 4 RC 5670391 189206400 2.9969340
## 5 Unknown 21340230 189206400 11.2788098
## 6 Non-TE 133811869 189206400 70.7226970
Dvir_masking$Type <- factor(Dvir_masking$Type, levels=c("DNA", "RC", "LTR", "LINE", "Unknown", "Non-TE"))
ggplot (Dvir_masking, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#00CCFF", "#CCCCCC")) +
theme_void()
# Dnovamexicana
Dnov_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/TE_landscapes/Dnov_masking.csv', header=T)
Dnov_masking
## Type nt_masked total_nt perc_mask
## 1 LINE 6281676 182394971 3.4439963
## 2 DNA 894270 182394971 0.4902931
## 3 LTR 11169301 182394971 6.1236891
## 4 Helitron/DINEs 11289884 182394971 6.1898000
## 5 Other 7673241 182394971 4.2069367
## 6 Unknown 5411510 182394971 2.9669184
## 7 Non-TE 139675089 182394971 76.5783663
Dnov_masking$Type <- factor(Dnov_masking$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (Dnov_masking, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
# D americana
Dame_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/TE_landscapes/Dame_masking.csv', header=T)
Dame_masking
## Type nt_masked total_nt perc_mask
## 1 LINE 8062670 211471196 3.8126564
## 2 DNA 1288628 211471196 0.6093634
## 3 LTR 10142700 211471196 4.7962560
## 4 Helitron/DINEs 28521833 211471196 13.4873371
## 5 Other 9166368 211471196 4.3345705
## 6 Unknown 4845328 211471196 2.2912473
## 7 Non-TE 149443669 211471196 70.6685694
Dame_masking$Type <- factor(Dame_masking$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (Dame_masking, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
Genome-wide and Y-specific TE annotation: landscape plots
Dvir_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Apr9_dvir_mod/Dvir_genome.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dvir_landscape$V1
types
## [1] Other LTR DNA DINE LINE RC Unknown
## Levels: DINE DNA LINE LTR Other RC Unknown
Dvir_landscape$V1 <- NULL
rownames(Dvir_landscape) <- types
RC_DINE <- Dvir_landscape["DINE",] + Dvir_landscape["RC",]
to_remove <- c("DINE", "RC")
Dvir_landscape_edit <- Dvir_landscape[!row.names(Dvir_landscape)%in%to_remove,]
Dvir_landscape_edit2 <- rbind(Dvir_landscape_edit, RC_DINE)
Dvir_landscape_edit2
## V2 V3 V4 V5 V6 V7 V8 V9
## Other 32 88 40 293 47 280 1798 1366
## LTR 2759332 1397409 1317267 1275702 1089392 1326759 1156997 1024066
## DNA 198823 54226 116348 393031 417977 353076 180435 177560
## LINE 888723 1825663 786903 765242 544501 474068 555860 329650
## Unknown 212580 83833 107727 106931 138050 156283 127209 171456
## DINE 145202 118866 157942 330471 626207 817256 803570 827175
## V10 V11 V12 V13 V14 V15 V16 V17 V18
## Other 5463 29386 75226 117274 159211 298042 461193 339794 382328
## LTR 976129 745479 751488 704309 675488 720830 639797 549795 442516
## DNA 105077 91533 65248 67084 58969 58590 55899 47702 43585
## LINE 232108 164667 185111 234833 265122 192473 168230 175475 184472
## Unknown 138048 146374 196972 199331 146371 155603 148140 180721 179929
## DINE 1061538 1172542 1030390 555089 345377 323574 340841 425320 356125
## V19 V20 V21 V22 V23 V24 V25 V26 V27
## Other 232149 357786 246011 173314 52782 41802 35676 22189 8836
## LTR 405957 402708 401342 338518 277580 297462 257488 186664 215339
## DNA 48646 40907 35617 34734 32283 23356 22798 21464 19854
## LINE 189020 201204 189857 195557 200234 174941 180720 186384 166611
## Unknown 179076 214708 215945 267813 272711 309622 219828 224705 198708
## DINE 362969 348372 334102 371618 405743 351483 291181 238222 201068
## V28 V29 V30 V31 V32 V33 V34 V35 V36 V37
## Other 3445 4166 1609 1448 89 263 107 16 0 0
## LTR 195932 168675 192037 189576 230217 157962 122727 82590 37683 24743
## DNA 12098 8392 17669 8791 4888 4002 1468 420 398 0
## LINE 140322 109860 79331 67316 56746 57216 27478 13627 11726 9881
## Unknown 122138 103013 64690 43788 28373 20899 8992 4974 3387 653
## DINE 148273 102247 70323 40556 24305 13928 6778 2149 741 266
## V38 V39 V40 V41
## Other 0 0 0 0
## LTR 14244 3773 2469 680
## DNA 61 0 0 0
## LINE 2904 8443 2533 0
## Unknown 588 14 76 0
## DINE 0 14 0 0
# ok, got it.
Types <- c()
windows <- c()
Proportion <- c()
for (i in 1:nrow(Dvir_landscape_edit2)) {
Types <- c(Types, rep(rownames(Dvir_landscape_edit2)[i], times=40))
windows <- c(windows, c(1:40))
Proportion <- c(Proportion, as.numeric(Dvir_landscape_edit2[i,1:ncol(Dvir_landscape_edit2)]))
}
Dvir_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
head(Dvir_landscape_df)
## Type Window Prop
## 1 Other 1 32
## 2 Other 2 88
## 3 Other 3 40
## 4 Other 4 293
## 5 Other 5 47
## 6 Other 6 280
# still need to divide by the appropriate number
Dvir_genome_size <- 189680791
Dvir_landscape_df$Prop <- Dvir_landscape_df$Prop/Dvir_genome_size
# remove the SINE
Dvir_landscape_df_mod <- Dvir_landscape_df[which(Dvir_landscape_df$Type!="SINE"),]
Dvir_landscape_df_mod$Type <- factor(Dvir_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))
# stacked barplot
ggplot(Dvir_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
geom_bar(position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
ylim(0,0.0225) +
theme_bw()
#### Dvir Y chrom ####
Dvir_Y_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Dvir_Ycontigs.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dvir_Y_landscape$V1
types
## [1] LTR RC LINE Unknown DINE DNA
## Levels: DINE DNA LINE LTR RC Unknown
Dvir_Y_landscape$V1 <- NULL
rownames(Dvir_Y_landscape) <- types
RC_DINE <- Dvir_Y_landscape["DINE",] + Dvir_Y_landscape["RC",]
to_remove <- c("DINE", "RC")
Dvir_Y_landscape_edit <- Dvir_Y_landscape[!row.names(Dvir_Y_landscape)%in%to_remove,]
Dvir_Y_landscape_edit2 <- rbind(Dvir_Y_landscape_edit, RC_DINE)
# ok, got it.
Types <- c()
windows <- c()
Proportion <- c()
for (i in 1:nrow(Dvir_Y_landscape_edit2)) {
Types <- c(Types, rep(rownames(Dvir_Y_landscape_edit2)[i], times=40))
windows <- c(windows, c(1:40))
Proportion <- c(Proportion, as.numeric(Dvir_Y_landscape_edit2[i,1:ncol(Dvir_Y_landscape_edit2)]))
}
Dvir_Y_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
head(Dvir_Y_landscape_df)
## Type Window Prop
## 1 LTR 1 387055
## 2 LTR 2 452240
## 3 LTR 3 600157
## 4 LTR 4 603886
## 5 LTR 5 542391
## 6 LTR 6 549035
# still need to divide by the appropriate number
Dvir_Y_size <- 14824018
Dvir_Y_landscape_df$Prop <- Dvir_Y_landscape_df$Prop/Dvir_Y_size
# remove the SINE
Dvir_Y_landscape_df_mod <- Dvir_Y_landscape_df[which(Dvir_Y_landscape_df$Type!="SINE"),]
Dvir_Y_landscape_df_mod$Type <- factor(Dvir_Y_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))
# stacked barplot
ggplot(Dvir_Y_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
geom_bar(position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#00CCFF")) +
ylim(0,0.125) +
theme_bw()
######
##Dnov
Dnov_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Dnov_genome.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dnov_landscape$V1
types
## [1] LINE Other DNA Unknown LTR RC SINE DINE
## Levels: DINE DNA LINE LTR Other RC SINE Unknown
Dnov_landscape$V1 <- NULL
rownames(Dnov_landscape) <- types
RC_DINE <- Dnov_landscape["DINE",] + Dnov_landscape["RC",]
to_remove <- c("DINE", "RC")
Dnov_landscape_edit <- Dnov_landscape[!row.names(Dnov_landscape)%in%to_remove,]
Dnov_landscape_edit2 <- rbind(Dnov_landscape_edit, RC_DINE)
Dnov_landscape_edit2
## V2 V3 V4 V5 V6 V7 V8 V9 V10
## LINE 666640 729920 437873 355817 312038 301572 290409 165507 133375
## Other 29001 991 468 153232 25881 53056 238413 70350 127523
## DNA 61443 25899 53278 57070 40388 38933 28909 31969 29020
## Unknown 576183 89122 112566 160143 135381 121178 94068 95364 82662
## LTR 1370306 739714 611444 570504 596322 458092 428656 418732 438645
## SINE 0 0 0 0 0 222 146 0 440
## DINE 22016 110777 130047 193779 447657 782248 1039331 1132728 885255
## V11 V12 V13 V14 V15 V16 V17 V18 V19
## LINE 124750 143252 122111 143739 124972 134259 163202 155796 143143
## Other 207687 100252 429010 452917 597315 847750 1794867 1412014 526405
## DNA 39984 32846 27321 24952 22282 18774 29377 40824 48038
## Unknown 90219 104636 93560 126420 154453 140755 161782 166137 192379
## LTR 389911 424319 328714 228280 262493 253967 243258 213140 230401
## SINE 74 1120 0 399 136 0 360 150 75
## DINE 731183 582840 454148 365468 348149 364206 400064 471029 270405
## V20 V21 V22 V23 V24 V25 V26 V27 V28
## LINE 145807 148281 143347 134430 122546 127465 108247 91694 84593
## Other 336611 94241 81006 45266 11753 9658 15678 4317 2839
## DNA 37417 34165 24274 22080 21838 22812 17225 14084 10571
## Unknown 210071 239042 266531 286149 282942 324164 262746 236653 180718
## LTR 239281 211439 254110 314319 187124 158578 149229 156721 151432
## SINE 0 0 0 0 114 0 0 0 0
## DINE 318400 284890 294826 282726 285688 288269 222889 189121 148098
## V29 V30 V31 V32 V33 V34 V35 V36 V37 V38
## LINE 69338 65137 76447 79895 68733 59970 42895 20355 26152 10466
## Other 1463 1200 783 688 198 127 0 135 146 0
## DNA 10617 8748 5287 2489 3991 1175 1703 1848 304 0
## Unknown 150874 107046 72175 41392 24904 11675 9240 4321 932 1869
## LTR 150721 174365 206094 174096 127394 123091 98294 48947 23265 9356
## SINE 0 0 0 0 0 0 0 0 0 0
## DINE 103318 67281 33170 21661 10181 5969 1119 579 369 0
## V39 V40 V41
## LINE 5211 2221 27
## Other 0 0 0
## DNA 0 0 0
## Unknown 652 406 0
## LTR 3335 771 0
## SINE 0 0 0
## DINE 0 0 0
# ok, got it.
Types <- c()
windows <- c()
Proportion <- c()
for (i in 1:nrow(Dnov_landscape_edit2)) {
Types <- c(Types, rep(rownames(Dnov_landscape_edit2)[i], times=40))
windows <- c(windows, c(1:40))
Proportion <- c(Proportion, as.numeric(Dnov_landscape_edit2[i,1:ncol(Dnov_landscape_edit2)]))
}
Dnov_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
# still need to divide by the appropriate number
Dnov_genome_size <- 182394971
Dnov_landscape_df$Prop <- Dnov_landscape_df$Prop/Dnov_genome_size
# remove the SINE
Dnov_landscape_df_mod <- Dnov_landscape_df[which(Dnov_landscape_df$Type!="SINE"),]
Dnov_landscape_df_mod$Type <- factor(Dnov_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))
# stacked barplot
ggplot(Dnov_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
geom_bar(position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
ylim(0,0.0225) +
theme_bw()
#### Dnov Y chrom ###
Dnov_Y_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Dnov_Ycontigs.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dnov_Y_landscape$V1
Dnov_Y_landscape$V1 <- NULL
rownames(Dnov_Y_landscape) <- types
RC_DINE <- Dnov_Y_landscape["DINE",] + Dnov_Y_landscape["RC",]
to_remove <- c("DINE", "RC")
Dnov_Y_landscape_edit <- Dnov_Y_landscape[!row.names(Dnov_Y_landscape)%in%to_remove,]
Dnov_Y_landscape_edit2 <- rbind(Dnov_Y_landscape_edit, RC_DINE)
# ok, got it.
Types <- c()
windows <- c()
Proportion <- c()
for (i in 1:nrow(Dnov_Y_landscape_edit2)) {
Types <- c(Types, rep(rownames(Dnov_Y_landscape_edit2)[i], times=40))
windows <- c(windows, c(1:40))
Proportion <- c(Proportion, as.numeric(Dnov_Y_landscape_edit2[i,1:ncol(Dnov_Y_landscape_edit2)]))
}
Dnov_Y_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
# still need to divide by the appropriate number
Dnov_Y_size <- 8975053
Dnov_Y_landscape_df$Prop <- Dnov_Y_landscape_df$Prop/Dnov_Y_size
# remove the SINE
Dnov_Y_landscape_df_mod <- Dnov_Y_landscape_df[which(Dnov_Y_landscape_df$Type!="SINE"),]
Dnov_Y_landscape_df_mod$Type <- factor(Dnov_Y_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))
# stacked barplot
ggplot(Dnov_Y_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
geom_bar(position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
ylim(0,0.125) +
theme_bw()
### Now Dame ####
Dame_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Dame_genome.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dame_landscape$V1
Dame_landscape$V1 <- NULL
rownames(Dame_landscape) <- types
RC_DINE <- Dame_landscape["DINE",] + Dame_landscape["RC",]
to_remove <- c("DINE", "RC")
Dame_landscape_edit <- Dame_landscape[!row.names(Dame_landscape)%in%to_remove,]
Dame_landscape_edit2 <- rbind(Dame_landscape_edit, RC_DINE)
# ok, got it.
Types <- c()
windows <- c()
Proportion <- c()
for (i in 1:nrow(Dame_landscape_edit2)) {
Types <- c(Types, rep(rownames(Dame_landscape_edit2)[i], times=40))
windows <- c(windows, c(1:40))
Proportion <- c(Proportion, as.numeric(Dame_landscape_edit2[i,1:ncol(Dame_landscape_edit2)]))
}
Dame_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
# still need to divide by the appropriate number
Dame_genome_size <- 211471196
Dame_landscape_df$Prop <- Dame_landscape_df$Prop/Dame_genome_size
# remove the SINE
Dame_landscape_df_mod <- Dame_landscape_df[which(Dame_landscape_df$Type!="SINE"),]
Dame_landscape_df_mod$Type <- factor(Dame_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))
# stacked barplot
ggplot(Dame_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
geom_bar(position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
ylim(0,0.0225) +
theme_bw()
#### Dame Y ######
Dame_Y_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/Feb7/Dame_Ycontigs.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- Dame_Y_landscape$V1
Dame_Y_landscape$V1 <- NULL
rownames(Dame_Y_landscape) <- types
RC_DINE <- Dame_Y_landscape["DINE",] + Dame_Y_landscape["RC",]
to_remove <- c("DINE", "RC")
Dame_Y_landscape_edit <- Dame_Y_landscape[!row.names(Dame_Y_landscape)%in%to_remove,]
Dame_Y_landscape_edit2 <- rbind(Dame_Y_landscape_edit, RC_DINE)
# ok, got it.
Types <- c()
windows <- c()
Proportion <- c()
for (i in 1:nrow(Dame_Y_landscape_edit2)) {
Types <- c(Types, rep(rownames(Dame_Y_landscape_edit2)[i], times=40))
windows <- c(windows, c(1:40))
Proportion <- c(Proportion, as.numeric(Dame_Y_landscape_edit2[i,1:ncol(Dame_Y_landscape_edit2)]))
}
Dame_Y_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
# still need to divide by the appropriate number
Dame_Y_size <- 24076955
Dame_Y_landscape_df$Prop <- Dame_Y_landscape_df$Prop/Dame_Y_size
# remove the SINE
Dame_Y_landscape_df_mod <- Dame_Y_landscape_df[which(Dame_Y_landscape_df$Type!="SINE"),]
Dame_Y_landscape_df_mod$Type <- factor(Dame_Y_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))
# stacked barplot
ggplot(Dame_Y_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
geom_bar(position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
ylim(0,0.125) +
theme_bw()
Now, look specifically at LINE differences
Dvir_LINEs <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/LINEs/Dvir_landscape.LINE.Rfam.txt', header=F, sep="\t")
Dvir_LINEs$V1 <- NULL
Types <- c()
windows <- c()
Proportion <- c()
for (i in 1:nrow(Dvir_LINEs)) {
Types <- c(Types, rep(as.character(Dvir_LINEs[i,1]), times=40))
windows <- c(windows, c(1:40))
Proportion <- c(Proportion, as.numeric(Dvir_LINEs[i,2:ncol(Dvir_LINEs)]))
}
Dvir_LINEs_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
head(Dvir_LINEs_df)
## Type Window Prop
## 1 I-Jockey 1 103793
## 2 I-Jockey 2 456773
## 3 I-Jockey 3 148221
## 4 I-Jockey 4 400683
## 5 I-Jockey 5 249661
## 6 I-Jockey 6 141288
# still need to divide by the appropriate number
Dvir_genome_size <- 189680791
Dvir_LINEs_df$Prop <- Dvir_LINEs_df$Prop/Dvir_genome_size
# remove the SINE
Dvir_LINEs_df_mod <- Dvir_LINEs_df[which(Dvir_LINEs_df$Type!="LINE" & Dvir_LINEs_df$Type!="L2" & Dvir_LINEs_df$Type!="R1-LOA" & Dvir_LINEs_df$Type!="I"),]
Dvir_LINEs_df_mod$Type <- factor(Dvir_LINEs_df_mod$Type, levels=c("CR1", "I-Jockey", "Penelope", "R1"))
# stacked barplot
ggplot(Dvir_LINEs_df_mod, aes(fill=Type, y=Prop, x=Window)) +
geom_bar(position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#FFCC00", "#9966FF")) +
#ylim(0,0.037) +
theme_bw()
## now Dnov
Dnov_LINEs <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/landscape/LINEs/Dnov_landscape.LINE.Rfam.txt', header=F, sep="\t")
Dnov_LINEs$V1 <- NULL
Types <- c()
windows <- c()
Proportion <- c()
for (i in 1:nrow(Dnov_LINEs)) {
Types <- c(Types, rep(as.character(Dnov_LINEs[i,1]), times=40))
windows <- c(windows, c(1:40))
Proportion <- c(Proportion, as.numeric(Dnov_LINEs[i,2:ncol(Dnov_LINEs)]))
}
Dnov_LINEs_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
head(Dnov_LINEs_df)
## Type Window Prop
## 1 I-Jockey 1 460641
## 2 I-Jockey 2 252512
## 3 I-Jockey 3 64731
## 4 I-Jockey 4 20796
## 5 I-Jockey 5 28060
## 6 I-Jockey 6 39746
# still need to divide by the appropriate number
Dnov_genome_size <- 182394971
Dnov_LINEs_df$Prop <- Dnov_LINEs_df$Prop/Dnov_genome_size
# remove the SINE
Dnov_LINEs_df_mod <- Dnov_LINEs_df[which(Dnov_LINEs_df$Type!="LINE" & Dnov_LINEs_df$Type!="L2" & Dnov_LINEs_df$Type!="R1-LOA" & Dnov_LINEs_df$Type!="I"),]
Dnov_LINEs_df_mod$Type <- factor(Dnov_LINEs_df_mod$Type, levels=c("CR1", "I-Jockey", "Penelope", "R1"))
# stacked barplot
ggplot(Dnov_LINEs_df_mod, aes(fill=Type, y=Prop, x=Window)) +
geom_bar(position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#FFCC00", "#9966FF")) +
ylim(0,0.01) +
theme_bw()
Now, Y chromosome only annotation: pie charts
Dame_Y <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dame_Y_masking.csv', header=T)
Dame_Y
## Type nt_masked total_nt perc_mask
## 1 LINE 1906166 24076955 7.916972890
## 2 DNA 40256 24076955 0.167197222
## 3 LTR 3899143 24076955 16.194502170
## 4 Helitron/DINEs 16691516 24076955 69.325693390
## 5 Other 694 24076955 0.002882424
## 6 Unknown 126786 24076955 0.526586522
## 7 Non-TE 1412394 24076955 5.866165385
Dame_Y$Type <- factor(Dame_Y$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (Dame_Y, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
### everything but Y
Dame_nonY <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dame_allbutY_masking.csv', header=T)
Dame_nonY
## Type nt_masked total_nt perc_mask
## 1 LINE 6153409 187394241 3.2836703
## 2 DNA 1249870 187394241 0.6669735
## 3 LTR 6211930 187394241 3.3148991
## 4 Helitron/DINEs 11828129 187394241 6.3118957
## 5 Other 9154695 187394241 4.8852595
## 6 Unknown 4683844 187394241 2.4994599
## 7 Non-TE 148112364 187394241 79.0378419
Dame_nonY$Type <- factor(Dame_nonY$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (Dame_nonY, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
Dnov_Y <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dnov_Y_masking.csv', header=T)
Dnov_Y
## Type nt_masked total_nt perc_mask
## 1 LINE 1721083 8975053 19.1762990
## 2 DNA 19007 8975053 0.2117759
## 3 LTR 3680934 8975053 41.0129500
## 4 Helitron/DINEs 2203574 8975053 24.5522116
## 5 Other 34497 8975053 0.3843654
## 6 Unknown 244302 8975053 2.7220118
## 7 Non-TE 1071656 8975053 11.9403863
Dnov_Y$Type <- factor(Dame_Y$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (Dnov_Y, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
### everything but Y
Dnov_nonY <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dnov_allbutY_masking.csv', header=T)
Dnov_nonY$Type <- factor(Dnov_nonY$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (Dnov_nonY, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
Dvir_Y <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dvir_Y_masking.csv', header=T)
Dvir_Y
## Type nt_masked total_nt perc_mask
## 1 LINE 1895813 14824018 12.788793
## 2 DNA 435272 14824018 2.936262
## 3 LTR 6596624 14824018 44.499568
## 4 Helitron/DINEs 4344957 14824018 29.310252
## 5 Other 0 14824018 0.000000
## 6 Unknown 323572 14824018 2.182755
## 7 Non-TE 1227780 14824018 8.282370
Dvir_Y$Type <- factor(Dvir_Y$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (Dvir_Y, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
### everything but Y
Dvir_nonY <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Ychrom/Dvir_allbutY_masking.csv', header=T)
Dvir_nonY
## Type nt_masked total_nt perc_mask
## 1 LINE 8349199 174856773 4.774879
## 2 DNA 2387737 174856773 1.365539
## 3 LTR 15162711 174856773 8.671503
## 4 Helitron/DINEs 8406865 174856773 4.807858
## 5 Other 3053549 174856773 1.746314
## 6 Unknown 4776687 174856773 2.731771
## 7 Non-TE 132720025 174856773 75.902136
Dvir_nonY$Type <- factor(Dvir_nonY$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (Dvir_nonY, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
Now, chromosome-level TE density in 100 kb windows
Dvir_interval_sizes <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dvir_100kb.intervals.sizes', header=F, sep="\t")
Dvir_TEdens <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dvir_100kb.intervals_TE.density', header=F, sep="\t", as.is = T)
#Dvir_interval_sizes <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dvir_intervals.sizes', header=F, sep="\t")
#Dvir_TEdens <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dvir_intervals_TE.density', header=F, sep="\t", as.is = T)
Dvir_interval_sizes <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dame_100kb.intervals.sizes.redo', header=F, sep="\t")
Dvir_TEdens <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dame_100kb.intervals_TE.density.redo', header=F, sep="\t", as.is = T)
Dvir_interval_sizes <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dnov_100kb.intervals.sizes', header=F, sep="\t")
Dvir_TEdens <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/piRNA_sequencing/Spring2020/Muller/Dnov_100kb.intervals_TE.density', header=F, sep="\t", as.is = T)
sizes <- as.numeric(Dvir_interval_sizes$V2)
cluster_labels <- Dvir_TEdens$V1
Dvir_TEdens$V1 <- NULL
rownames(Dvir_TEdens) <- cluster_labels
colnames(Dvir_TEdens) <- c("LTR", "RC", "LINE", "DNA", "Unknown")
head(Dvir_TEdens)
## LTR RC LINE DNA Unknown NA
## Chr_2-0-100000 1698 133 5594 303 20020 986
## Chr_2-100000-200000 843 0 9506 115 123 1326
## Chr_2-200000-300000 1019 305 3140 0 30728 1558
## Chr_2-300000-400000 358 623 2487 0 460 1965
## Chr_2-400000-500000 328 1397 2766 65 14207 3720
## Chr_2-500000-600000 566 447 2093 1121 42 2305
Dvir_TEdens_mx <- data.matrix(Dvir_TEdens)
Dvir_TEdens_norm <- Dvir_TEdens_mx/sizes
nrow(Dvir_TEdens_norm)
## [1] 1565
length(sizes) # need to rerun to get the correct size
## [1] 1565
head(Dvir_TEdens_norm)
## LTR RC LINE DNA Unknown <NA>
## Chr_2-0-100000 0.01698 0.00133 0.05594 0.00303 0.20020 0.00986
## Chr_2-100000-200000 0.00843 0.00000 0.09506 0.00115 0.00123 0.01326
## Chr_2-200000-300000 0.01019 0.00305 0.03140 0.00000 0.30728 0.01558
## Chr_2-300000-400000 0.00358 0.00623 0.02487 0.00000 0.00460 0.01965
## Chr_2-400000-500000 0.00328 0.01397 0.02766 0.00065 0.14207 0.03720
## Chr_2-500000-600000 0.00566 0.00447 0.02093 0.01121 0.00042 0.02305
# now need to put into a different format for plotting
Types <- c()
clusters <- c()
Proportion <- c()
IDs <- c()
for (i in 1:nrow(Dvir_TEdens_norm)) {
Types <- c(Types, "LTR", "LINE", "RC", "DNA", "Other", "Unknown")
clusters <- c(clusters, rep(rownames(Dvir_TEdens_norm)[i], times=6))
Proportion <- c(Proportion, as.numeric(Dvir_TEdens_norm[i,1:ncol(Dvir_TEdens_norm)]))
IDs <- c(IDs, rep (i, times=6))
}
#Dvir_TEdens_df <- data.frame(Clusters=clusters, Type=Types, Prop=Proportion, Chr=chrom)
Dvir_TEdens_df <- data.frame(Clusters=clusters, Type=Types, Prop=Proportion, ID=IDs)
head(Dvir_TEdens_df)
## Clusters Type Prop ID
## 1 Chr_2-0-100000 LTR 0.01698 1
## 2 Chr_2-0-100000 LINE 0.00133 1
## 3 Chr_2-0-100000 RC 0.05594 1
## 4 Chr_2-0-100000 DNA 0.00303 1
## 5 Chr_2-0-100000 Other 0.20020 1
## 6 Chr_2-0-100000 Unknown 0.00986 1
temp1 <- strsplit(as.character(Dvir_TEdens_df$Clusters), "-")
Dvir_TEdens_df$Chr <- sapply(temp1, function(x) x[1])
Dvir_TEdens_df$Type <- factor(Dvir_TEdens_df$Type, levels=c("DNA", "RC", "LTR", "LINE", "Other", "Unknown"))
# make a plot for each Chr like piRNA clusters.
Dvir_TEdens_df_Chr2 <- subset(Dvir_TEdens_df, Chr == "Chr_2")
head(Dvir_TEdens_df_Chr2)
## Clusters Type Prop ID Chr
## 1 Chr_2-0-100000 LTR 0.01698 1 Chr_2
## 2 Chr_2-0-100000 LINE 0.00133 1 Chr_2
## 3 Chr_2-0-100000 RC 0.05594 1 Chr_2
## 4 Chr_2-0-100000 DNA 0.00303 1 Chr_2
## 5 Chr_2-0-100000 Other 0.20020 1 Chr_2
## 6 Chr_2-0-100000 Unknown 0.00986 1 Chr_2
# keep working on this.
library(ggplot2)
# x should be ID, not cluster. the clusters will be sorted wrong by R.
ggplot(Dvir_TEdens_df_Chr2, aes(x = ID, y = Prop)) +
geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
ylim(0,1) +
theme_bw() +
theme(axis.text.x=element_blank())
ggplot() +
geom_bar(data = Dvir_TEdens_df_Chr2, mapping = aes(fill=Type, y=Prop, x=ID), position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
ylim(0,1.1) +
theme_bw() +
theme(axis.text.x=element_blank())
Dvir_TEdens_df_Chr3 <- subset(Dvir_TEdens_df, Chr == "Chr_3")
tail(Dvir_TEdens_df_Chr3)
## Clusters Type Prop ID Chr
## 4045 Chr_3-27200000-27274493 LTR 0.008671956 675 Chr_3
## 4046 Chr_3-27200000-27274493 LINE 0.031278107 675 Chr_3
## 4047 Chr_3-27200000-27274493 RC 0.045628448 675 Chr_3
## 4048 Chr_3-27200000-27274493 DNA 0.001463225 675 Chr_3
## 4049 Chr_3-27200000-27274493 Other 0.000000000 675 Chr_3
## 4050 Chr_3-27200000-27274493 Unknown 0.036379257 675 Chr_3
# i think there's an issue here - fixed it!
# Chr_3-28200000-28253658 9251 2153 44795 2308 0 0
ggplot(Dvir_TEdens_df_Chr3, aes(x = ID, y = Prop)) +
geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
ylim(0,0.8) +
theme_bw() +
theme(axis.text.x=element_blank())
ggplot() +
geom_bar(data = Dvir_TEdens_df_Chr3, mapping = aes(fill=Type, y=Prop, x=ID), position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
ylim(0,1.1) +
theme_bw() +
theme(axis.text.x=element_blank())
Dvir_TEdens_df_Chr4 <- subset(Dvir_TEdens_df, Chr == "Chr_4" )
ggplot(Dvir_TEdens_df_Chr4, aes(x = ID, y = Prop)) +
geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
ylim(0,0.6) +
theme_bw() +
theme(axis.text.x=element_blank())
ggplot() +
geom_bar(data = Dvir_TEdens_df_Chr4, mapping = aes(fill=Type, y=Prop, x=ID), position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
ylim(0,1.1) +
theme_bw() +
theme(axis.text.x=element_blank())
Dvir_TEdens_df_Chr5 <- subset(Dvir_TEdens_df, Chr == "Chr_5")
ggplot(Dvir_TEdens_df_Chr5, aes(x = ID, y = Prop)) +
geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
ylim(0,1) +
theme_bw() +
theme(axis.text.x=element_blank())
ggplot() +
geom_bar(data = Dvir_TEdens_df_Chr5, mapping = aes(fill=Type, y=Prop, x=ID), position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
ylim(0,1.1) +
theme_bw() +
theme(axis.text.x=element_blank())
Dvir_TEdens_df_Chr6 <- subset(Dvir_TEdens_df, Chr == "Chr_6")
ggplot(Dvir_TEdens_df_Chr6, aes(x = ID, y = Prop)) +
geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
theme_bw() +
theme(axis.text.x=element_blank())
Dvir_TEdens_df_ChrX <- subset(Dvir_TEdens_df, Chr == "Chr_X")
ggplot(Dvir_TEdens_df_ChrX, aes(x = ID, y = Prop)) +
geom_line(aes(color = Type), alpha = 0.7, size = 0.3) +
scale_color_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
ylim(0,1) +
theme_bw() +
theme(axis.text.x=element_blank())
ggplot() +
geom_bar(data = Dvir_TEdens_df_ChrX, mapping = aes(fill=Type, y=Prop, x=ID), position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC","#FFCC00", "#00CCFF")) +
ylim(0,1.1) +
theme_bw() +
theme(axis.text.x=element_blank())
Make histogram of amount of DAIBAM in randomly selected regions in Dnov, of the same size as the breakpoints.
dnov_shuffle_daibam <- read.delim(file='~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/DAIBAM/Dnov_shuffle_DAIBAM.amounts', header=F, sep="\t", as.is = T)
head(dnov_shuffle_daibam)
## V1
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
dnov_shuffle_daibam <- as.vector(dnov_shuffle_daibam$V1)
hist(dnov_shuffle_daibam)
quantile(dnov_shuffle_daibam, probs = c(0.05, 0.95))
## 5% 95%
## 0 120
quantile(dnov_shuffle_daibam, probs = c( 0.95, 0.99, 0.999))
## 95% 99% 99.9%
## 120.000 2065.000 4030.991
Make histograms of ITR array length in the three species
dvir_ITRs <- read.delim("~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/ITRs/Dvir.ITRs.merged.filtered.bed", header=F, sep="\t", as.is = T)
head(dvir_ITRs)
## V1 V2 V3 V4
## 1 Chr_2 293710 294023 313
## 2 Chr_2 294464 295271 807
## 3 Chr_2 296106 296445 339
## 4 Chr_2 301009 301359 350
## 5 Chr_2 308059 308420 361
## 6 Chr_2 309077 309433 356
dvir_tohist <- dvir_ITRs$V4[which(dvir_ITRs$V4<20000)]
hist(dvir_tohist)
dnov_ITRs <- read.delim("~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/ITRs/Dnov.ITRs.merged.filtered.bed", header=F, sep="\t", as.is = T)
head(dnov_ITRs)
## V1 V2 V3 V4
## 1 Chr_2 92083 92257 174
## 2 Chr_2 117164 117287 123
## 3 Chr_2 204357 204506 149
## 4 Chr_2 248564 279145 30581
## 5 Chr_2 368809 368925 116
## 6 Chr_2 369624 369968 344
dnov_tohist <- dnov_ITRs$V4[which(dnov_ITRs$V4<20000)]
hist(dnov_tohist)
dame_ITRs <- read.delim("~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/ITRs/Dame.ITRs.merged.filtered.bed", header=F, sep="\t", as.is = T)
head(dame_ITRs)
## V1 V2 V3 V4
## 1 Chr_2 221966 223121 1155
## 2 Chr_2 279513 279633 120
## 3 Chr_2 291842 300987 9145
## 4 Chr_2 372159 372306 147
## 5 Chr_2 375389 375494 105
## 6 Chr_2 415783 526485 110702
dam_tohist <- dame_ITRs$V4[which(dame_ITRs$V4<20000)]
#hist(dam_tohist, breaks=c(100,200,500,1000,3000,5000,10000,20000))
hist(dam_tohist)
# next make a histogram with ggplot, limiting the x axis so we don't see these outliers
First, ML97.5
ML975_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Dame_nanopore/ML975_masking.csv', header=T)
ML975_masking
## Type nt_masked total_nt perc_mask
## 1 LINE 11946512 236071707 5.0605437
## 2 DNA 1234587 236071707 0.5229712
## 3 LTR 11974885 236071707 5.0725625
## 4 Helitron/DINEs 40777050 236071707 17.2731627
## 5 Other 10081978 236071707 4.2707269
## 6 Unknown 5143846 236071707 2.1789337
## 7 Non-TE 154912849 236071707 65.6210992
ML975_masking$Type <- factor(ML975_masking$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (ML975_masking, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
## now the landscape plot ###
ML975_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Dame_nanopore/DameNanopore_ML975_canu.ME.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- ML975_landscape$V1
ML975_landscape$V1 <- NULL
rownames(ML975_landscape) <- types
RC_DINE <- ML975_landscape["DINE",] + ML975_landscape["RC",]
to_remove <- c("DINE", "RC")
ML975_landscape_edit <- ML975_landscape[!row.names(ML975_landscape)%in%to_remove,]
ML975_landscape_edit2 <- rbind(ML975_landscape_edit, RC_DINE)
# ok, got it.
Types <- c()
windows <- c()
Proportion <- c()
for (i in 1:nrow(ML975_landscape_edit2)) {
Types <- c(Types, rep(rownames(ML975_landscape_edit2)[i], times=40))
windows <- c(windows, c(1:40))
Proportion <- c(Proportion, as.numeric(ML975_landscape_edit2[i,1:ncol(ML975_landscape_edit2)]))
}
ML975_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
# still need to divide by the appropriate number
ML975_size <- 236071707
ML975_landscape_df$Prop <- ML975_landscape_df$Prop/ML975_size
# remove the SINE
ML975_landscape_df_mod <- ML975_landscape_df[which(ML975_landscape_df$Type!="SINE"),]
ML975_landscape_df_mod$Type <- factor(ML975_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))
# stacked barplot
ggplot(ML975_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
geom_bar(position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
#ylim(0,0.025) +
theme_bw()
SB0206
SB0206_masking <- read.csv('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Dame_nanopore/SB0206_masking.csv', header=T)
SB0206_masking
## Type nt_masked total_nt perc_mask
## 1 LINE 12330108 240772203 5.1210679
## 2 DNA 1340327 240772203 0.5566785
## 3 LTR 13188648 240772203 5.4776456
## 4 Helitron/DINEs 35510110 240772203 14.7484259
## 5 Other 10481989 240772203 4.3534880
## 6 Unknown 5189270 240772203 2.1552613
## 7 Non-TE 162731751 240772203 67.5874328
SB0206_masking$Type <- factor(SB0206_masking$Type, levels=c("DNA", "Helitron/DINEs", "LTR", "LINE", "Other", "Unknown", "Non-TE"))
ggplot (SB0206_masking, aes(x = "", y = perc_mask, fill = Type)) +
geom_bar(width=1, stat="identity") +
coord_polar("y", start=0) +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF", "#CCCCCC")) +
theme_void()
## now the landscape plot ###
SB0206_landscape <- read.delim('~/Documents/AndyLab/Proposal/ThesisWork/Chapter2/Comparative_vir-nov/Dame_nanopore/DameNanopore_SB0206_canu.ME.fa.out.landscape.Div.Rclass.tab.sed.txt', header=F, sep="\t")
types <- SB0206_landscape$V1
SB0206_landscape$V1 <- NULL
rownames(SB0206_landscape) <- types
RC_DINE <- SB0206_landscape["DINE",] + SB0206_landscape["RC",]
to_remove <- c("DINE", "RC")
SB0206_landscape_edit <- SB0206_landscape[!row.names(SB0206_landscape)%in%to_remove,]
SB0206_landscape_edit2 <- rbind(SB0206_landscape_edit, RC_DINE)
# ok, got it.
Types <- c()
windows <- c()
Proportion <- c()
for (i in 1:nrow(SB0206_landscape_edit2)) {
Types <- c(Types, rep(rownames(SB0206_landscape_edit2)[i], times=40))
windows <- c(windows, c(1:40))
Proportion <- c(Proportion, as.numeric(SB0206_landscape_edit2[i,1:ncol(SB0206_landscape_edit2)]))
}
SB0206_landscape_df <- data.frame(Type=Types, Window=windows, Prop=Proportion)
# still need to divide by the appropriate number
SB0206_size <- 240772203
SB0206_landscape_df$Prop <- SB0206_landscape_df$Prop/SB0206_size
# remove the SINE
SB0206_landscape_df_mod <- SB0206_landscape_df[which(SB0206_landscape_df$Type!="SINE"),]
SB0206_landscape_df_mod$Type <- factor(SB0206_landscape_df_mod$Type, levels=c("DNA", "DINE", "LTR", "LINE", "Other", "Unknown"))
# stacked barplot
ggplot(SB0206_landscape_df_mod, aes(fill=Type, y=Prop, x=Window)) +
geom_bar(position="stack", stat="identity") +
scale_fill_manual(values = c("#33CC66", "#FF6600", "#9966FF", "#FF33CC", "#FFCC00", "#00CCFF")) +
#ylim(0,0.025) +
theme_bw()